# Restore the cleaned transcript objects into the calling environment;
# verbose = TRUE prints the name of every object as it is loaded.
# NOTE(review): presumably this is where `translations_filled` (used throughout
# this script) comes from — confirm against the cleaning script.
load("data/cleaned/transcripts_clean.RData", verbose = TRUE)

# Simple analyses - disagreement ----------------------------------------------------------------------------------------------------------------

# People less likely to disagree, when they actually select trans

# One row per (group_id, pair_id, stratum_id): each discussion's outcome and
# design variables, collapsed with first_non_na() (presumably the first
# non-missing value within the discussion — confirm in the helper).
transcripts_by_round <- translations_filled %>%
  # Lines without a pair_id are not attached to any discussion
  filter(!is.na(pair_id)) %>%
  group_by(group_id, pair_id, stratum_id) %>%
  summarise(
    across(
      c(disagreement, pair_includes_trans, r1_choose_comparator, starts_pro_trans),
      first_non_na
    ),
    # summarise() only peels off the last grouping level by default, which would
    # leave the result grouped by group_id/pair_id; drop all groups so that
    # downstream summaries operate on the whole table
    .groups = "drop"
  ) %>%
  mutate(r1_choose_comparator = as.logical(r1_choose_comparator))

# Disagreement by pair type, split by the round-1 choice
transcripts_by_round %>%
  bar_chart(
    x = pair_includes_trans,
    y = disagreement,
    fill = r1_choose_comparator
  )

# Disagreement regressed on the pair-type indicator, passing stratum_id as the
# fixed effect and group_id as the cluster variable
# (author's note: less disagreement in trans pairs, about 5pp)
model_disagreement <- feols_custom(
  disagreement ~ pair_includes_trans,
  data = transcripts_by_round,
  fixef = "stratum_id",
  cluster = "group_id"
)

# Raw disagreement means for non-trans (0) and trans (1) pairs
transcripts_by_round %>%
  filter(pair_includes_trans == 0) %>%
  get_mean(disagreement)
transcripts_by_round %>%
  filter(pair_includes_trans == 1) %>%
  get_mean(disagreement)

# Export the coefficient with its sign flipped and scaled by times_100(),
# followed by its p-value
model_disagreement %>%
  get_coeff(term = "pair_includes_trans") %>%
  {-1 * .} %>%
  times_100() %>%
  write_stat("outputs/stats/coeff_disagreement.tex")
model_disagreement %>%
  get_p_val(term = "pair_includes_trans") %>%
  write_stat("outputs/stats/pval_disagreement.tex")

# People less likely to disagree, when first argument is pro-trans
transcripts_by_round %>%
  bar_chart(
    x = pair_includes_trans,
    y = disagreement,
    fill = starts_pro_trans
  )


# Dynamics of disagreement ----------------------------------------------------------------------------------------------------------------

# Coded argument lines, numbered in order of appearance within each discussion
# (group_id x pair_id). Lines spoken by "Lead" or "Other" are excluded; rows
# with a missing speaker are dropped as well because the comparison yields NA.
pro_anti_dynamics <- translations_filled %>%
  filter(
    !is.na(pair_id),
    who_speaking_tr != "Lead",
    who_speaking_tr != "Other",
    !is.na(pro_comparator_arg)
  ) %>%
  select(
    group_id, transcript_line_id, pair_id, stratum_id,
    pair_includes_trans, who_speaking_tr, pro_comparator_arg, pro_trans_arg
  ) %>%
  group_by(group_id, pair_id) %>%
  mutate(
    # Position of the argument inside its discussion
    argument_line_id = row_number(),
    # Stance of the discussion's opening argument
    start_pro_comparator = first_non_na(pro_comparator_arg)
  ) %>%
  ungroup()

# ROBUSTNESS - for arguments that end early, fill out the rest of the argument with the last value - indeed, when I make this adjustment, the changes in slope go away

# Intercept-only fit of pro_comparator_arg for every
# (argument position x pair type) cell. Fits that error are turned into NULL by
# safely(), and those cells disappear at unnest().
coeffs_pro_anti_dynamics <- pro_anti_dynamics %>%
  group_by(argument_line_id, pair_includes_trans) %>%
  nest() %>%
  mutate(
    model = data %>%
      map(safely(function(cell) feols_custom(pro_comparator_arg ~ 1, data = cell))) %>%
      map(function(attempt) attempt$result),
    coeffs = model %>%
      map(safely(tidy_90)) %>%
      map(function(attempt) attempt$result)
  ) %>%
  unnest(coeffs)

# Same intercept-only fits, additionally split by the stance of the opening
# argument (start_pro_comparator).
coeffs_pro_anti_dynamics_w_start <- pro_anti_dynamics %>%
  group_by(argument_line_id, pair_includes_trans, start_pro_comparator) %>%
  nest() %>%
  mutate(
    model = data %>%
      map(safely(function(cell) feols_custom(pro_comparator_arg ~ 1, data = cell))) %>%
      map(function(attempt) attempt$result),
    coeffs = model %>%
      map(safely(tidy_90)) %>%
      map(function(attempt) attempt$result)
  ) %>%
  unnest(coeffs) %>%
  # Append hand-built rows for the first argument: conditional on the starting
  # stance the estimate is set equal to that stance (0 or 1), for both pair
  # types, with no interval columns.
  bind_rows(
    tidyr::crossing(
      argument_line_id = 1,
      pair_includes_trans = c(0, 1),
      nesting(
        start_pro_comparator = c(0, 1),
        estimate = c(0, 1)
      )
    )
  )

# Plot the intercept estimates as points with error bars (interval bounds as
# returned by tidy_90()), joined by dashed lines, one panel per pair type.
# The `type` label is attached to each table but is not mapped in the plot;
# colour follows start_pro_comparator (NA for the pooled estimates).
bind_rows(
  mutate(coeffs_pro_anti_dynamics, type = "all"),
  mutate(
    coeffs_pro_anti_dynamics_w_start,
    type = ifelse(start_pro_comparator == 1, "start pro", "start anti")
  )
) %>%
  mutate(
    pair_type = ifelse(pair_includes_trans==1, "trans", "non-trans")
  ) %>%
  # Only the first nine argument positions are shown
  filter(argument_line_id < 10) %>%
  ggplot(
    aes(
      x = argument_line_id,
      y = estimate,
      ymin = conf.low,
      ymax = conf.high,
      colour = factor(start_pro_comparator)
    )
  ) +
  geom_point() +
  geom_errorbar(width = 0.1) +
  geom_line(linetype = "dashed") +
  facet_wrap(~ pair_type)



# Alternative graph for dynamics of disagreement ----------------------------------------------------------------------------------------------------------------

# Lines from discussions that include the trans option, excluding "Lead" and
# "Other" speakers, numbered in order of appearance within each discussion.
# The numbering counts every remaining line, coded or not.
dynamics_raw <- translations_filled %>%
  filter(
    !is.na(pair_id),
    who_speaking_tr != "Lead",
    who_speaking_tr != "Other"
  ) %>%
  select(
    group_id, transcript_line_id, pair_id, stratum_id,
    pair_includes_trans, who_speaking_tr, pro_trans_arg
  ) %>%
  filter(pair_includes_trans == 1) %>%
  group_by(group_id, pair_id) %>%
  mutate(argument_line_id = row_number()) %>%
  ungroup()

# Analysis - prosocial arguments ----------------------------------------------------------------------------------------------------------------

# Estimate the mean of one column by fitting an intercept-only model.
#
# Arguments:
#   data    data frame holding the outcome column
#   y       name of the outcome column, as a string
#   fixef, cluster
#           accepted but NOT used: neither is forwarded to feols_custom()
#           (NOTE(review): confirm whether clustering was intended here)
# Returns whatever tidy_90() yields for the fitted model.
mean_ci_feols <- function(data, y, fixef = "stratum_id", cluster = "group_id") {
  fit <- feols_custom(as.formula(paste0(y, " ~ 1")), data = data)
  tidy_90(fit)
}


# Share of pro-trans / anti-trans / neither arguments at every argument
# position, separately for each group's first and second discussion.
dynamics_coeffs <- dynamics_raw %>%
  arrange(group_id, pair_id, argument_line_id) %>%

  # Keep coded lines only.
  # NOTE(review): because NA codings are removed here, the replace_na() below
  # never fires and arg_neither is FALSE on every row — confirm whether the
  # "NEITHER" series is meant to be populated.
  filter(!is.na(pro_trans_arg)) %>%
  mutate(
    arg_type = replace_na(as.character(pro_trans_arg), "NEITHER"),
    arg_pro_trans = arg_type == "TRUE",
    arg_anti_trans = arg_type == "FALSE",
    arg_neither = arg_type == "NEITHER"
  ) %>%

  # Relabel discussions within each group: the pair_id equal to min_na(pair_id)
  # becomes 1, every other one becomes 2
  group_by(group_id) %>%
  mutate(pair_id = ifelse(pair_id == min_na(pair_id), 1, 2)) %>%

  # One nested data set per (discussion number x argument position), crossed
  # with the three indicator columns to be averaged
  group_by(pair_id, argument_line_id) %>%
  nest() %>%
  tidyr::crossing(
    y = c("arg_pro_trans", "arg_anti_trans", "arg_neither")
  ) %>%
  mutate(
    # Fits that error become NULL and are dropped by unnest()
    coeffs = map2(data, y, safely(mean_ci_feols)) %>%
      map(function(attempt) attempt$result),
    pair_id = str_glue("Discussion {pair_id}")
  ) %>%
  unnest(coeffs)

# Estimated shares over the first 19 argument positions, one panel per
# discussion, with a faint ribbon spanning conf.low to conf.high
dynamics_coeffs %>%
  filter(argument_line_id < 20) %>%
  ggplot(
    aes(
      x = argument_line_id,
      y = estimate,
      ymin = conf.low,
      ymax = conf.high
    )
  ) +
  geom_point(aes(colour = y)) +
  geom_line(aes(colour = y), alpha = 0.4) +
  geom_ribbon(aes(group = y, fill = y), alpha = 0.02, linetype = "dashed") +
  facet_wrap(~pair_id) +
  theme_classic()

# ggsave() writes the most recently created plot
ggsave(
  "outputs/figs/discussion_dynamics_pro_anti.pdf",
  width = 7,
  height = 5
)

# Word cloud (main) ----------------------------------------------------------------------------------------------------------------

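# Packages used below (e.g. tidytext for unnest_tokens()/stop_words, wordcloud) are presumably attached upstream
# The pipeline reads `translations_filled`, which must already be in the environment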


# Prepare transcript data for word cloud visualization
# This script processes transcript data to analyze word frequencies between trans and non-trans speakers
# Word-level comparison table for the word clouds: for every retained word, its
# count (n_*) and share of all retained words (p_*) in discussions with and
# without the trans option (pair_includes_trans), plus the gap between shares.
word_cloud_data <- translations_filled %>%
  # Remove speeches from 'Lead' and 'Other' speakers
  filter(who_speaking != "Lead", who_speaking != "Other") %>%

  # Tokenize the English speech text into individual words
  unnest_tokens(input = speech_english, output = word, token = "words") %>%

  # Remove specific word "line" (likely a transcript artifact)
  filter(word != "line") %>%

  # Normalise curly apostrophes, then drop common negative contractions
  mutate(word = str_replace_all(word, "’", "'")) %>%
  filter(!word %in% c("don't", "wouldn't", "shouldn't", "won't")) %>%

  # Remove stop words and transcript-specific terms
  filter(!(word %in% stop_words$word),
         !(word %in% c("inaudible", "iinaudible", "it's", "it's", "it’s", "let's",  "na"))) %>%

  # Merge variants of the same term. Every pattern is anchored to the whole
  # token: unanchored patterns also rewrote unrelated words
  # (e.g. "alright" -> "alrights", "inequality" -> "inequal",
  # "strain" -> "strains").
  mutate(word = word %>%
    str_replace("^opportunities$", "opportunity") %>%
    str_replace("^jobs$", "job") %>%
    str_replace("^(equally|equality)$", "equal") %>%
    str_replace("^train$", "trains") %>%
    str_replace("^right$", "rights") %>%
    str_replace("^humans$", "human")
  ) %>%

  # Count word frequencies by treatment
  count(pair_includes_trans, word) %>%

  # Total number of retained words within each treatment
  group_by(pair_includes_trans) %>%
  mutate(total_words = sum(n)) %>%

  # Readable labels; these become the column suffixes after pivot_wider()
  mutate(pair_includes_trans_label = ifelse(pair_includes_trans == 1, "trans", "non_trans")) %>%

  # Remove any rows with NA in pair_includes_trans
  filter(!is.na(pair_includes_trans)) %>%
  ungroup() %>%

  # Share of each word within its treatment
  mutate(p = n / total_words) %>%

  # Drop helper columns so that each word collapses to a single row below
  select(-pair_includes_trans, -total_words) %>%

  # One row per word, with p_trans / p_non_trans / n_trans / n_non_trans
  pivot_wider(
    names_from = pair_includes_trans_label,
    values_from = c(p, n),
    values_fill = list(p = 0, n = 0)  # Words absent from one treatment get 0
  ) %>%

  mutate(
    # Difference in shares between trans and non-trans discussions
    p_diff = p_trans - p_non_trans,
    # Ratio of shares (NA if the non-trans share is 0)
    p_ratio = ifelse(p_non_trans == 0, NA_real_, p_trans / p_non_trans)
  ) %>%

  # Sort by difference in shares
  arrange(desc(p_diff)) %>%

  # Scale down the 'transgender' gap. This has no effect on the final table
  # while the filter below removes that word; it only matters if the filter is
  # switched off.
  mutate(p_diff = ifelse(word == "transgender", p_diff / 2.45, p_diff)) %>%

  # Remove the word 'transgender' from final dataset
  filter(word != "transgender")

# Ten-colour palette used for the word clouds
palette <- c(
  "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
  "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"
)

# Seed the RNG before drawing the clouds
set.seed(99999)

# Words with the largest positive p_diff (more frequent in trans discussions),
# written to PDF
pdf("outputs/figs/word_cloud.pdf")
wordcloud::wordcloud(
  words = word_cloud_data$word,
  freq = word_cloud_data$p_diff,
  scale = c(5.5, 1),
  max.words = 40,
  random.order = FALSE,
  colors = palette
)
dev.off()

# Mirror image: words more frequent in non-trans discussions. This one is drawn
# on the active graphics device and is not written to a file.
wordcloud::wordcloud(
  words = word_cloud_data$word,
  freq = -word_cloud_data$p_diff,
  scale = c(5.5, 1),
  max.words = 40,
  random.order = FALSE,
  colors = palette
)


# Word cloud - control videos only ----------------------------------------------------------------------------------------------------------------


# Word-level comparison table restricted to groups that saw the control video:
# per word, its count (n_*) and share (p_*) in discussions with and without the
# trans option, plus the gap between the two shares.
world_cloud_data_control_vid <- translations_filled %>%

  # Attach each group's video type and keep only control videos
  tidylog::left_join(
    df %>% select(group_id, video_type) %>% dups_drop(), by = c("group_id")
  ) %>%
  filter(video_type == "control") %>%

  # Remove speeches from 'Lead' and 'Other' speakers
  filter(who_speaking != "Lead", who_speaking != "Other") %>%

  # Tokenize the English speech text into individual words
  unnest_tokens(input = speech_english, output = word, token = "words") %>%

  # Remove specific word "line" (likely a transcript artifact)
  filter(word != "line") %>%

  # Normalise curly apostrophes, then drop common negative contractions
  mutate(word = str_replace_all(word, "’", "'")) %>%
  filter(!word %in% c("don't", "wouldn't", "shouldn't", "won't")) %>%

  # Remove stop words and transcript-specific terms
  filter(!(word %in% stop_words$word),
         !(word %in% c("inaudible", "iinaudible", "it's", "it's", "it’s", "let's",  "na"))) %>%

  # Merge variants of the same term. Every pattern is anchored to the whole
  # token: unanchored patterns also rewrote unrelated words
  # (e.g. "alright" -> "alrights", "inequality" -> "inequal",
  # "strain" -> "strains").
  mutate(word = word %>%
    str_replace("^opportunities$", "opportunity") %>%
    str_replace("^jobs$", "job") %>%
    str_replace("^(equally|equality)$", "equal") %>%
    str_replace("^train$", "trains") %>%
    str_replace("^right$", "rights") %>%
    str_replace("^humans$", "human")
  ) %>%

  # Count word frequencies by treatment
  count(pair_includes_trans, word) %>%

  # Total number of retained words within each treatment
  group_by(pair_includes_trans) %>%
  mutate(total_words = sum(n)) %>%

  # Readable labels; these become the column suffixes after pivot_wider()
  mutate(pair_includes_trans_label = ifelse(pair_includes_trans == 1, "trans", "non_trans")) %>%

  # Remove any rows with NA in pair_includes_trans
  filter(!is.na(pair_includes_trans)) %>%
  ungroup() %>%

  # Share of each word within its treatment, then one row per word
  mutate(p = n / total_words) %>%
  select(-pair_includes_trans, -total_words) %>%
  pivot_wider(
    names_from = pair_includes_trans_label,
    values_from = c(p, n),
    values_fill = list(p = 0, n = 0)  # Words absent from one treatment get 0
  ) %>%

  mutate(
    # Difference in shares between trans and non-trans discussions
    p_diff = p_trans - p_non_trans,
    # Ratio of shares (NA if the non-trans share is 0)
    p_ratio = ifelse(p_non_trans == 0, NA_real_, p_trans / p_non_trans)
  ) %>%
  arrange(desc(p_diff)) %>%

  # Scale down the 'transgender' gap. This has no effect on the final table
  # while the filter below removes that word.
  mutate(p_diff = ifelse(word == "transgender", p_diff / 2.45, p_diff)) %>%
  filter(word != "transgender")


# Seed the RNG before drawing the cloud
set.seed(99999)

# Control-video word cloud: words with the largest positive p_diff.
# Both `words` and `freq` must come from the control-video table; taking the
# frequencies from the full-sample table pairs each word with the weight of an
# unrelated row (and the two tables need not even have the same length).
pdf("outputs/figs/word_cloud_vid.pdf")
wordcloud::wordcloud(words = world_cloud_data_control_vid$word,
                     freq = world_cloud_data_control_vid$p_diff,
                     scale = c(5.5, 1),
                     max.words = 40, random.order = FALSE, colors = palette)
dev.off()



# Other discussions - caste ----------------------------------------------------------------------------------------------------------------

# Create a vector of caste-related terms
caste_terms <- c(
  # General caste system terms
  "caste", "varna", "jati",
  # Specific caste groups
  "brahmin", "brahmana", "brahman",
  "kshatriya", "vaishya", "shudra",
  "dalit", "harijan", "untouchable",
  # Related terms
  "scheduled caste", "scheduled tribe",
  "obc", "other backward class",
  # Discriminatory practices
  "endogamy", "inter-caste", "intercaste"
)

# Speeches whose lowercased English text contains any caste term.
# Matching is plain substring matching (no word boundaries), so a term also
# matches inside a longer word (e.g. "caste" inside "forecasted").
caste_mentions <- translations_filled %>%
  # Flag speeches matching the alternation of all terms
  mutate(
    contains_caste_term = str_detect(
      tolower(speech_english),
      paste(caste_terms, collapse = "|")
    )
  ) %>%
  # Keep matching speeches only (rows with missing text drop out as NA)
  filter(contains_caste_term) %>%
  select(
    who_speaking,
    speech_english,
    contains_caste_term
  ) %>%
  # List which specific terms were found in each speech.
  # vapply() guarantees a plain character column: sapply() would return an
  # empty list when no speech matches and would attach every full speech text
  # as an element name.
  mutate(
    matched_terms = vapply(
      tolower(speech_english),
      function(text) {
        hits <- caste_terms[str_detect(text, caste_terms)]
        paste(unique(hits), collapse = ", ")
      },
      character(1),
      USE.NAMES = FALSE
    )
  )

# View results
print(paste("Number of speeches containing caste-related terms:", nrow(caste_mentions)))
print(caste_mentions)


# Other discussions - income ----------------------------------------------------------------------------------------------------------------

# Create vectors of income and poverty-related terms
# Regex patterns for income/poverty vocabulary, one per term.
# Terms are lowercased because they are matched against lowercased speech text
# (an uppercase pattern such as "MNREGA" could never match), then wrapped in
# word boundaries so that only whole words/phrases match.
income_poverty_terms <- paste0(
  "\\b",
  tolower(c(
    # Income terms
    "income", "salary", "wage", "wages", "earnings", "money",
    "rich", "wealthy", "affluent", "well-off",
    "middle class", "upper class", "lower class",

    # Poverty terms
    "poor", "poverty", "impoverished", "destitute",
    "underprivileged", "disadvantaged", "marginalized",
    "below poverty", "poverty line",
    "economic hardship", "financial hardship",
    "financial struggle", "economic struggle",

    # Economic status indicators
    "slum", "homeless", "housing", "shelter",
    "unemployed", "unemployment", "jobless",
    "food insecurity", "hunger", "starvation",
    "basic needs", "necessities",

    # Financial terms
    "debt", "loan", "credit", "savings",
    "expenses", "costs", "bills", "rent",
    "afford", "expensive", "cheap",

    # Economic mobility
    "social mobility", "economic mobility",
    "opportunity", "advancement", "upliftment",

    # Aid/support terms
    "welfare", "benefits", "assistance",
    "subsidy", "subsidies", "ration",
    "pension", "scholarship",

    # Specific Indian context
    "bpl", "apl",  # Below Poverty Line, Above Poverty Line
    "ration card",
    "antyodaya", # Scheme for the poorest of the poor
    "MNREGA", "NREGA", # Employment guarantee scheme
    "jan dhan", # Financial inclusion program
    "ayushman", # Healthcare scheme
    "ujjwala"  # LPG connection scheme
  )),
  "\\b"
)

# Search for income/poverty-related terms in transcripts
# Speeches (excluding 'Lead' and 'Other' speakers) whose lowercased English
# text matches at least one income/poverty pattern.
income_poverty_mentions <- translations_filled %>%
  filter(who_speaking != "Lead", who_speaking != "Other") %>%
  # Flag speeches matching the alternation of all patterns
  mutate(
    contains_income_poverty_term = str_detect(
      tolower(speech_english),
      paste(income_poverty_terms, collapse = "|")
    )
  ) %>%
  # Keep matching speeches only (rows with missing text drop out as NA)
  filter(contains_income_poverty_term) %>%
  select(
    who_speaking,
    speech_english,
    contains_income_poverty_term
  ) %>%
  # List which specific terms were found in each speech.
  # vapply() guarantees a plain character result: sapply() would return an
  # empty list when no speech matches and would attach every full speech text
  # as an element name.
  mutate(
    matched_terms = vapply(
      tolower(speech_english),
      function(text) {
        hits <- income_poverty_terms[str_detect(text, income_poverty_terms)]
        paste(unique(hits), collapse = ", ")
      },
      character(1),
      USE.NAMES = FALSE
    ) %>%
      str_replace_all("\\\\b", "")  # Strip the word-boundary markers for display
  )

# View results
print(paste("Number of speeches containing income/poverty-related terms:", nrow(income_poverty_mentions)))
print(income_poverty_mentions, n = 40)

# Optional: Create a frequency table of most commonly used terms
# Number of speeches in which each term was matched, most frequent first
term_frequency <- income_poverty_mentions %>%
  # Split the comma-separated list back into one row per matched term
  mutate(term_list = strsplit(matched_terms, ", ")) %>%
  unnest(term_list) %>%
  count(term = term_list, name = "frequency", sort = TRUE)

print("Most frequently used terms:")
print(head(term_frequency, 20))


# Check LEAD doesn't mention transgender ----------------------------------------------------------------------------------------------------------------


# Per group: did the lead ever say "trans", and did the lead say it before any
# other speaker did?
# The pattern is the bare substring "trans" (case-insensitive), so words such
# as "translate" or "transport" count as well.
# NOTE(review): the cumulative sums follow the row order of
# translations_filled — presumably transcript order; confirm.
# NOTE(review): later code joins this object on group_id, so count_prop()
# presumably returns its input unchanged after printing — confirm.
lead_said_trans <- translations_filled %>%
  filter(!is.na(speech_english)) %>%
  mutate(
    said_trans = str_detect(speech_english, regex("trans", ignore_case = TRUE)),
    # Anyone who is not the lead (this includes "Other") counts as participant
    participant_said_trans = who_speaking_tr != "Lead" & said_trans,
    lead_said_trans = who_speaking_tr == "Lead" & said_trans
  ) %>%
  group_by(group_id) %>%
  mutate(
    # Running counts of mentions so far within the group
    lead_cumsum = cumsum(lead_said_trans),
    participant_cumsum = cumsum(participant_said_trans),
    # TRUE on lines where the lead has already said it and no participant has
    lead_before_participant = lead_cumsum > 0 & participant_cumsum == 0
  ) %>%
  summarise(
    lead_said_trans = sum_na(lead_said_trans) > 0,
    lead_said_trans_before_participant = sum_na(lead_before_participant) > 0
  ) %>%
  count_prop(lead_said_trans, lead_said_trans_before_participant)

# Regress the round-2 trans choice on whether the lead said "trans" before any
# participant, within full discussions only
feols_custom(
  r2_choose_trans ~ lead_said_trans_before_participant,
  data = filter(
    left_join(r2_with_relations, lead_said_trans, by = "group_id"),
    discuss_type == "discussion_full"
  ),
  fixef = c("stratum_id", "video_type", "phase"),
  cluster = "group_id"
)

# TYPE OF REASON plot, and pro vs trans ----------------------------------------------------------------------------------------------------------------

# Counts of pro-/anti-trans coded arguments per hand-coded reason.
transcripts_by_reason <- translations_filled %>%
  # A line may list several reasons separated by "; " — one row per reason
  separate_rows(reasons_joined, sep = "; ") %>%
  # Collapse every variant containing "Person seems reliable" into one label
  mutate(
    reasons_joined = case_when(
      str_detect(reasons_joined, "Person seems reliable") ~ "Person seems reliable etc.",
      TRUE ~ reasons_joined
    )
  ) %>%
  group_by(reasons_joined) %>%
  summarise(
    # Number of lines with a non-missing coding
    n = sum(!is.na(pro_trans_arg)),
    n_pro_trans = sum_na(pro_trans_arg),
    n_anti_trans = sum_na(!pro_trans_arg)
  ) %>%
  ungroup() %>%
  mutate(
    p_pro_trans = n_pro_trans / n,
    p_anti_trans = n_anti_trans / n,
    # Negated count so anti-trans bars can be drawn below zero
    n_anti_trans_neg = - n_anti_trans,
    # Total coded arguments (kept for reference; not used for ordering)
    total_n = n_pro_trans + n_anti_trans,
    # Order the factor levels by the anti-trans share, ascending
    reasons_joined = factor(reasons_joined,
                            levels = reasons_joined[order(p_anti_trans)])
  ) %>%
  # Keep reasons with more than 10 coded lines, then print the table
  filter(n > 10) %>%
  print(n = 100)

# Create the plot
# Diverging bar chart: pro-trans counts above zero, anti-trans counts below
ggplot(transcripts_by_reason, aes(x = reasons_joined)) +
  # Pro-trans counts (positive bars)
  geom_col(aes(y = n_pro_trans), fill = "#4CAF50", width = 0.7) +
  # Anti-trans counts (negated, so the bars point downwards)
  geom_col(aes(y = n_anti_trans_neg), fill = "indianred", width = 0.7) +
  # Show absolute values on the axis so the downward bars read as counts
  scale_y_continuous(labels = abs) +
  labs(
    title = "Pro-Trans vs Anti-Trans Responses by Reason (hand-coded)",
    x = "Reason",
    y = "Count"
  ) +
  theme_minimal() +
  # Rotate x-axis labels for readability
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggsave(
  "outputs/figs/pro_anti_reasons.pdf",
  width = 7,
  height = 7
)


# Proportion of positive/anti-trans mentions ----------------------------------------------------------------------------------------------

glimpse(translations_filled)

# Lines from trans discussions, excluding 'Lead' and 'Other' speakers.
# NOTE(review): the object is reused below as line-level data, so count_prop()
# presumably prints a tabulation and returns its input — confirm.
transcripts_trimmed <- translations_filled %>%
  filter(
    who_speaking != "Lead",
    who_speaking != "Other",
    pair_includes_trans == 1
  ) %>%
  count_prop(pro_trans_arg)

# Shares of lines coded pro-trans / anti-trans; `%in%` treats uncoded (NA)
# lines as FALSE, so both shares are taken over all lines
mean_pro_trans <- transcripts_trimmed %>%
  mutate(pro_trans_arg = pro_trans_arg %in% TRUE) %>%
  get_mean("pro_trans_arg")
mean_anti_trans <- transcripts_trimmed %>%
  mutate(anti_trans_arg = pro_trans_arg %in% FALSE) %>%
  get_mean("anti_trans_arg")

ratio_pro_anti <- mean_pro_trans / mean_anti_trans

# Export the ratio and the two shares
write_stat(ratio_pro_anti, "outputs/stats/ratio_pro_anti_transcripts.tex", digits = 1)
write_percentage(mean_pro_trans, "outputs/stats/mean_pro_transcripts.tex")
write_percentage(mean_anti_trans, "outputs/stats/mean_anti_transcripts.tex")

# p-value of the intercept from an intercept-only model of pro_trans_arg,
# with group_id passed as the cluster variable
feols_custom(pro_trans_arg ~ 1, data = transcripts_trimmed, cluster = "group_id") %>%
  get_p_val("(Intercept)") %>%
  write_p_val("outputs/stats/pval_pro_transcripts.tex")
